Main statistical analysis
Data preparation
Create dataset for modeling the main experiment data. Filter out control data (which were analyzed above), then make a new variable target humidity coded as an ordered factor.
# Drop the control observations (time_min == 0) and re-level the target
# humidity factor so levels run High -> Dry.
final.glm.data <- dplyr::filter(exp_data, time_min != 0)
hum.levels <- c("High", "Mid_High", "Mid_Low", "Dry")
final.glm.data$target_hum_factor <- factor(
  as.character(final.glm.data$target_hum_factor),
  levels = hum.levels
)
Create binary “success” variable, where success is spectro intensity readings greater than our calculated LOD (above)
# Binary outcome: 1 when the 2-week spectro intensity exceeds the feather LOD
# (limit of detection) computed earlier, 0 otherwise.
feather.lod <- dplyr::filter(LODs.feather.paper, type == "Feather")$LOD
final.glm.data$success <- as.numeric(final.glm.data$int_2wk > feather.lod)
Create numeric time variable
# Numeric copy of exposure time, for use as a continuous predictor.
final.glm.data[["time_min_num"]] <- as.numeric(final.glm.data[["time_min"]])
Re-scale the humidity variable so that it is expressed in percent
# Express mean humidity as a percentage (0-100) rather than a proportion.
final.glm.data[["scaled.hum"]] <- 100 * final.glm.data[["actual_hum_mean"]]
Construct GLM
Reviewers of the first version of the manuscript pointed out that vapour pressure deficit (VPD) is more meaningful than relative humidity (RH), so we’ll repeat all the main analyses below using VPD in lieu of RH. These new analyses appear after the original ones.
Model using RH
The full model including interaction term is the appropriate model given the factorial experiment.
# Full factorial logistic model: RH x exposure time interaction.
glm.full <- glm(success ~ scaled.hum * time_min_num,
                family = "binomial", data = final.glm.data)
Get model output
# Coefficient table, deviances and AIC for the full RH model.
summary(glm.full)
##
## Call:
## glm(formula = success ~ scaled.hum * time_min_num, family = "binomial",
## data = final.glm.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.00101 -0.25909 -0.02339 0.52884 2.16979
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.4202972 1.6297249 -2.099 0.03584 *
## scaled.hum 0.0587368 0.0217283 2.703 0.00687 **
## time_min_num -0.0895784 0.0404624 -2.214 0.02684 *
## scaled.hum:time_min_num 0.0009251 0.0004626 2.000 0.04553 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 146.114 on 127 degrees of freedom
## Residual deviance: 71.502 on 124 degrees of freedom
## AIC: 79.502
##
## Number of Fisher Scoring iterations: 8
Get confidence intervals on coefficients:
# Profile-likelihood 95% confidence intervals for the coefficients.
confint(glm.full)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -7.3554860269 -0.717267069
## scaled.hum 0.0221713102 0.110171403
## time_min_num -0.1848864629 -0.022631338
## scaled.hum:time_min_num 0.0001427493 0.002004403
For comparison, use the likelihood ratio test approach:
# Nested models for likelihood ratio tests: intercept-only, RH-only,
# then RH + time (the full interaction model is glm.full, fitted above).
glm.null <- glm(success ~ 1, family = "binomial", data = final.glm.data)
glm.hum <- glm(success ~ scaled.hum, family = "binomial", data = final.glm.data)
glm.hum_time <- glm(success ~ scaled.hum + time_min_num,
                    family = "binomial", data = final.glm.data)
Model comparisons:
# LRT: does adding RH improve on the intercept-only model?
anova(glm.null, glm.hum, test = "LRT")
## Analysis of Deviance Table
##
## Model 1: success ~ 1
## Model 2: success ~ scaled.hum
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 127 146.114
## 2 126 88.829 1 57.285 3.771e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# LRT: does adding exposure time improve on the RH-only model?
anova(glm.hum, glm.hum_time, test = "LRT")
## Analysis of Deviance Table
##
## Model 1: success ~ scaled.hum
## Model 2: success ~ scaled.hum + time_min_num
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 126 88.829
## 2 125 77.155 1 11.674 0.000634 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# LRT: does the RH x time interaction improve on the additive model?
anova(glm.hum_time, glm.full, test = "LRT")
## Analysis of Deviance Table
##
## Model 1: success ~ scaled.hum + time_min_num
## Model 2: success ~ scaled.hum * time_min_num
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 125 77.155
## 2 124 71.502 1 5.6533 0.01742 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Calculate “pseudo-R-square” values using the pR2 function in the pscl package:
# Pseudo-R-squared measures (McFadden, r2ML, r2CU) for the full RH model.
pscl::pR2(glm.full)
## llh llhNull G2 McFadden r2ML r2CU
## -35.7510184 -73.0568196 74.6116024 0.5106409 0.4417247 0.6489611
Model using VPD (used for geospatial predictions)
The full model including interaction term is the appropriate model given the factorial experiment.
# Full factorial logistic model on the VPD scale: VPD x exposure time.
glm.full.vpd <- glm(success ~ vpd * time_min_num,
                    family = "binomial", data = final.glm.data)
Get model output
# Coefficient table, deviances and AIC for the full VPD model.
summary(glm.full.vpd)
##
## Call:
## glm(formula = success ~ vpd * time_min_num, family = "binomial",
## data = final.glm.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.00389 -0.26468 -0.02623 0.52777 2.20346
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.448012 0.765232 3.199 0.00138 **
## vpd -2.111239 0.776629 -2.718 0.00656 **
## time_min_num 0.002575 0.007195 0.358 0.72047
## vpd:time_min_num -0.032695 0.016455 -1.987 0.04692 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 146.114 on 127 degrees of freedom
## Residual deviance: 71.713 on 124 degrees of freedom
## AIC: 79.713
##
## Number of Fisher Scoring iterations: 8
Get confidence intervals on coefficients:
# Profile-likelihood 95% confidence intervals for the VPD model coefficients.
confint(glm.full.vpd)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 1.05849016 4.098242055
## vpd -3.94683956 -0.802387251
## time_min_num -0.01131228 0.017634614
## vpd:time_min_num -0.07113824 -0.004893459
For comparison, use the likelihood ratio test approach:
# Nested VPD models for LRTs, mirroring the RH model sequence above.
glm.null.vpd <- glm(success ~ 1, family = "binomial", data = final.glm.data)
glm.vpd <- glm(success ~ vpd, family = "binomial", data = final.glm.data)
glm.vpd_time <- glm(success ~ vpd + time_min_num,
                    family = "binomial", data = final.glm.data)
Model comparisons:
# LRT: does adding VPD improve on the intercept-only model?
anova(glm.null.vpd, glm.vpd, test = "LRT")
## Analysis of Deviance Table
##
## Model 1: success ~ 1
## Model 2: success ~ vpd
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 127 146.11
## 2 126 89.01 1 57.104 4.134e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# LRT: does adding exposure time improve on the VPD-only model?
anova(glm.vpd, glm.vpd_time, test = "LRT")
## Analysis of Deviance Table
##
## Model 1: success ~ vpd
## Model 2: success ~ vpd + time_min_num
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 126 89.010
## 2 125 77.296 1 11.713 0.0006206 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# LRT: does the VPD x time interaction improve on the additive model?
anova(glm.vpd_time, glm.full.vpd, test = "LRT")
## Analysis of Deviance Table
##
## Model 1: success ~ vpd + time_min_num
## Model 2: success ~ vpd * time_min_num
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 125 77.296
## 2 124 71.713 1 5.5836 0.01813 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Calculate “pseudo-R-square” values using the pR2 function in the pscl package:
# Pseudo-R-squared measures for the full VPD model.
pscl::pR2(glm.full.vpd)
## llh llhNull G2 McFadden r2ML r2CU
## -35.8563993 -73.0568196 74.4008405 0.5091985 0.4408047 0.6476095
Note that the two approaches (RH versus VPD) yield qualitatively identical results, and quantitatively very similar results.
Visualize the results
First we’ll look at the interaction plot of the main experiment results.
Here we use the ggeffects package and its ggpredict function, which provides predictions for the focal variable holding the others constant. It also provides 95% confidence bands:
We’ll generate predictions using the original “time” (= “duration”) values in the experiment, and the actual (measured) values of mean relative humidity per group.
We use the ggpredict function from the ggeffects package.
# Predicted success probabilities at the experimental exposure times and at
# the measured mean RH (%) of each humidity group, with 95% CI bands.
rh.terms <- c("time_min_num [10, 60, 120, 240]", "scaled.hum [8, 36, 71, 88]")
full.glm.ggpredict <- ggeffects::ggpredict(glm.full, terms = rh.terms)
full.glm.ggpredict
##
## # Predicted probabilities of success
## # x = time_min_num
##
## # scaled.hum = 36
## x predicted std.error conf.low conf.high
## 10 0.134 0.795 0.031 0.423
## 60 0.009 1.182 0.001 0.086
## 120 0.000 2.470 0.000 0.039
## 240 0.000 5.283 0.000 0.011
##
## # scaled.hum = 71
## x predicted std.error conf.low conf.high
## 10 0.625 0.415 0.425 0.790
## 60 0.335 0.408 0.185 0.529
## 120 0.107 0.796 0.025 0.364
## 240 0.007 1.768 0.000 0.179
##
## # scaled.hum = 8
## x predicted std.error conf.low conf.high
## 10 0.022 1.307 0.002 0.230
## 60 0.000 1.894 0.000 0.015
## 120 0.000 3.844 0.000 0.005
## 240 0.000 8.147 0.000 0.001
##
## # scaled.hum = 88
## x predicted std.error conf.low conf.high
## 10 0.841 0.550 0.643 0.940
## 60 0.779 0.417 0.608 0.889
## 120 0.683 0.366 0.513 0.816
## 240 0.447 0.656 0.183 0.745
## Standard errors are on link-scale (untransformed).
And using VPD (using the mean values of VPD per target humidity group; see Table 3 above):
Note that we’ll need to reverse the ordering of the VPD values after, to correspond with order of increasing RH values:
# Same predictions on the VPD scale, at the mean VPD per humidity group.
vpd.terms <- c("time_min_num [10, 60, 120, 240]", "vpd [0.32, 0.80, 1.78, 2.53]")
full.glm.vpd.ggpredict <- ggeffects::ggpredict(glm.full.vpd, terms = vpd.terms)
full.glm.vpd.ggpredict
##
## # Predicted probabilities of success
## # x = time_min_num
##
## # vpd = 0.32
## x predicted std.error conf.low conf.high
## 10 0.845 0.556 0.647 0.942
## 60 0.786 0.423 0.616 0.894
## 120 0.695 0.372 0.524 0.826
## 240 0.470 0.660 0.195 0.764
##
## # vpd = 0.8
## x predicted std.error conf.low conf.high
## 10 0.628 0.415 0.428 0.792
## 60 0.342 0.404 0.190 0.534
## 120 0.112 0.784 0.026 0.370
## 240 0.007 1.741 0.000 0.184
##
## # vpd = 1.78
## x predicted std.error conf.low conf.high
## 10 0.134 0.791 0.032 0.422
## 60 0.009 1.171 0.001 0.087
## 120 0.000 2.448 0.000 0.040
## 240 0.000 5.238 0.000 0.012
##
## # vpd = 2.53
## x predicted std.error conf.low conf.high
## 10 0.024 1.280 0.002 0.234
## 60 0.000 1.849 0.000 0.017
## 120 0.000 3.756 0.000 0.006
## 240 0.000 7.965 0.000 0.001
## Standard errors are on link-scale (untransformed).
Now let’s reverse the ordering of the VPD levels:
# Reverse the VPD level order so panels line up with increasing RH.
vpd.group.levels <- levels(full.glm.vpd.ggpredict$group)
full.glm.vpd.ggpredict$group <- factor(full.glm.vpd.ggpredict$group,
                                       levels = rev(vpd.group.levels))
Now we can visualize these predictions, first using RH, then using VPD:
# Facet labels keyed by the mean-RH group values produced by ggpredict.
labels.facet <- c("Mean RH = 8%", "Mean RH = 36%", "Mean RH = 71%", "Mean RH = 88%")
names(labels.facet) <- c("8", "36", "71", "88")
# Predicted viability vs exposure time, one facet per mean RH group,
# with 95% CI error bars. (Later theme() layers override earlier ones.)
model.plot <- ggplot(full.glm.ggpredict, aes(x = x, y = predicted)) +
geom_point(position = position_dodge(.2), shape = 1, size = 2.5) +
theme_bw() +
geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 2) +
labs(x = "Exposure time (minutes)", y = "Probability of diatoms being viable") +
ylim(0, 1) +
scale_x_continuous(breaks = c(10, 60, 120, 240), limits = c(-10, 260)) +
theme(panel.grid.major.y = element_line(colour = "lightgrey", size = (0.25)),
panel.grid.major.x = element_line(colour = "lightgrey", size = (0.25)),
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill = NA)) +
facet_grid(. ~ group, labeller = labeller(group = labels.facet))
model.plot
Now VPD:
# Facet labels keyed by the mean-VPD group values (re-ordered above so that
# panels run from low VPD / humid to high VPD / dry).
labels.facet <- c("Mean VPD = 2.53", "Mean VPD = 1.78", "Mean VPD = 0.80", "Mean VPD = 0.32")
names(labels.facet) <- c("2.53", "1.78", "0.8", "0.32")
# Fig. S6: same interaction plot as above but on the VPD scale.
Fig_S6_vpd.glm.plot <- ggplot(full.glm.vpd.ggpredict, aes(x = x, y = predicted)) +
geom_point(position = position_dodge(.2), shape = 1, size = 2.5) +
theme_bw() +
geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 2) +
labs(x = "Exposure time (minutes)", y = "Probability of diatoms being viable") +
ylim(0, 1) +
scale_x_continuous(breaks = c(10, 60, 120, 240), limits = c(-10, 260)) +
theme(panel.grid.major.y = element_line(colour = "lightgrey", size = (0.25)),
panel.grid.major.x = element_line(colour = "lightgrey", size = (0.25)),
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill = NA)) +
facet_grid(. ~ group, labeller = labeller(group = labels.facet))
Fig_S6_vpd.glm.plot
Predict across RH, VPD and distance
Create a dataframe with VPD values from 0.25 to 1.25 kPa (spanning conditions typical of the study region, based on analyses of real RH and temperature data; see below), an average flight speed of 69 km/h (see manuscript), and exposure times from 10 to 240 minutes. Then use our GLM model to predict viability probabilities across the distances travelled.
# Prediction grid: every minute from 10 to 240, five VPD values spanning
# 0.25-1.25 kPa, and a fixed mean flight speed of 69 km/h.
vpd.grid.values <- seq(from = 0.25, to = 1.25, length = 5)
disp.distances <- expand.grid(time_min_num = 10:240,
                              vpd = vpd.grid.values,
                              flight.speed = 69)
# Predicted viability probability from the VPD model, plus the distance (km)
# covered in each exposure time at the given flight speed.
disp.distances$predicted.prob.vpd <- round(
  predict(glm.full.vpd, newdata = disp.distances, type = "response"), 4)
disp.distances$dist <- disp.distances$time_min_num *
  disp.distances$flight.speed / 60
For plotting, create a factor variable for VPD, and re-order levels such that VPD ordered high values to low values:
# Ordered factor version of VPD, levels running high -> low for plotting.
vpd.grid.labels <- as.character(seq(from = 0.25, to = 1.25, length = 5))
disp.distances$vpd.factor <- as.factor(disp.distances$vpd)
disp.distances$vpd.factor <- ordered(disp.distances$vpd.factor,
                                     levels = rev(vpd.grid.labels))
Visualize probability as a function of distance for different values of VPD, using average flight speed of 69 km/h.
# Fig. 3A: predicted probability of diatom viability versus distance travelled
# (at 69 km/h), one line per VPD level; drier air (higher VPD) gives lower
# curves. Fix: use TRUE rather than the reassignable shorthand T.
Fig3A.dist.prob.vpd <- ggplot(data = disp.distances, aes(y = predicted.prob.vpd,
x = dist, group = vpd.factor)) +
scale_y_continuous(breaks = seq(0, 1, by = 0.1)) +
scale_x_continuous(breaks = seq(0, 320, by = 40)) +
geom_line(aes(linetype = vpd.factor), size = 0.8) +
scale_linetype_manual(values = 5:1, labels = rev(as.character(seq(from = 0.25, to = 1.25, length = 5))),
name = "Vapour pressure deficit (kPa)") +
theme(legend.background = element_rect(colour = 'black'),
legend.key = element_rect(fill = 'white'),
legend.position = "top", legend.key.size = unit(1, "cm"),
legend.title = element_text(size = 9),
legend.text = element_text(size = 7)) +
guides(linetype = guide_legend(nrow = 1, reverse = TRUE,
title.position = "top", title.hjust = 0.5, title.vjust = - 1)) +
theme(panel.grid.major.y = element_line(colour = "lightgrey", size = (0.25)),
panel.grid.major.x = element_line(colour = "lightgrey", size = (0.25)),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill = NA)) +
labs(x = "Distance travelled by mallard (km)", y = "Probability of diatoms remaining viable") +
theme(axis.text.x = element_text(size = 7)) +
theme(axis.text.y = element_text(size = 7)) +
theme(axis.title.x = element_text(size = 9)) +
theme(axis.title.y = element_text(size = 9)) +
annotate("text", x = 290, y = 0.95, label = "A")
Fig3A.dist.prob.vpd
Ordinal factor GLM
Here we report the results from coding the time and humidity predictors as ordinal factors, instead of continuous numeric variables.
We conduct likelihood ratio tests (LRTs) to compare models that use ordinal factors.
Create ordinal factor variables:
# Exposure time as a factor for the factor-coded models. NOTE(review):
# as.factor() creates an unordered factor (treatment contrasts), even though
# the surrounding text calls these "ordinal" — confirm intent.
final.glm.data[["time_min_fact"]] <- as.factor(final.glm.data[["time_min"]])
Build models using treatment contrasts:
# Factor-coded model sequence: full interaction, null, humidity-only,
# and additive humidity + time.
glm.full.fact <- glm(success ~ target_hum_factor * time_min_fact,
                     family = "binomial", data = final.glm.data)
glm.null.fact <- glm(success ~ 1, family = "binomial", data = final.glm.data)
glm.hum.fact <- glm(success ~ target_hum_factor,
                    family = "binomial", data = final.glm.data)
glm.hum_time.fact <- glm(success ~ target_hum_factor + time_min_fact,
                         family = "binomial", data = final.glm.data)
Now compare fits using LRTs:
# Sequential LRTs across the four nested factor-coded models.
anova(glm.null.fact, glm.hum.fact, glm.hum_time.fact, glm.full.fact, test = "LRT")
## Analysis of Deviance Table
##
## Model 1: success ~ 1
## Model 2: success ~ target_hum_factor
## Model 3: success ~ target_hum_factor + time_min_fact
## Model 4: success ~ target_hum_factor * time_min_fact
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 127 146.114
## 2 124 88.976 3 57.137 2.402e-12 ***
## 3 121 73.135 3 15.842 0.001222 **
## 4 112 57.665 9 15.470 0.078813 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We see that the full model that includes the interaction between humidity and time gains slightly less support than it did when coding the predictors as numeric. This is to be expected because in this approach the model uses a degree of freedom for each level (minus 1) for each term.
NOTE There is no need to repeat the preceding analysis with VPD, because the ordinal factor “target_hum_factor” would serve identically.
Aspatial and spatial predictions
With the aim of placing our experiment results into real-world contexts we sought relevant data pertaining to North and South Dakota and Nebraska. This region encompasses the prairie pothole region of the USA, an important breeding ground, and is also within the central migration flyway. We sought information about the following:
- relative humidity and air temperature throughout the study region and during the key months
- distribution and flight data for mallard ducks
- distribution of surface waters (potential habitat for ducks and diatoms)
- distribution of the genus Nitzschia and of the species Nitzschia pussella in surface waters
With the relative humidity and air temperature data, we could calculate corresponding values of vapour pressure deficit.
When combined, this information can help predict the potential for mallard-borne diatom dispersal within region.
Import data
Relative humidity data
Here we access online data about relative humidity and air temperature within the states of interest, and during the spring months corresponding to arrival of ducks into nesting areas (April) or during stopovers for continued migration north, and during nesting times (through end of May).
We will use the riem package, described here: https://docs.ropensci.org/riem/
We will retrieve relative humidity and air temperature data for the months of April through May inclusive for each state.
Get the ASOS networks (state-level) and stations (airports) of interest.
First, we know from the online vignette https://docs.ropensci.org/riem/articles/riem.html that the networks have a “code” that is simply the state abbreviation appended to “ASOS” with an underscore:
# ASOS network codes: the state abbreviation joined to "ASOS" by "_".
network.codes <- sprintf("%s_ASOS", c("ND", "SD", "NE"))
So, for our states of interest, we can get the station codes as follows:
# Fetch station metadata for each state network and tag each row with its
# two-letter state code (the part of the network code before "_").
# Building a list and binding once avoids growing a data frame inside the
# loop and removes the duplicated first-iteration code.
asos.stations <- dplyr::bind_rows(lapply(network.codes, function(code) {
  stations <- riem::riem_stations(network = code)
  stations$state <- sub("_.*$", "", code)
  stations
}))
Set the date values (we’ll use the last six years of data)
# Years and the April/May window for the weather download.
# NOTE(review): "05-30" stops at May 30, but the text elsewhere says
# "through May 31" — confirm the intended end date.
data.years <- seq(from = 2015, to = 2020, by = 1)
start.mo.day <- "04-01"
end.mo.day <- "05-30"
The field name for Relative Humidity (%) is “relh”, and temperature is “tmpf” (Fahrenheit), and we can now download these data for the correct years of interest, then we’ll subset and average the months of interest (April through May inclusive).
First, loop through the years and stations to download the data:
NOTE This takes > 10 minutes, so I’ve turned “eval = F”; change this to “eval = T” if you wish.
# Download relative humidity ("relh") and temperature ("tmpf", Fahrenheit)
# for every station and year, keeping only the columns we need.
# Fixes: the original grew asos.data with rbind() inside a nested loop and
# duplicated the whole download call in an if/else for the first iteration;
# here each chunk goes into a list and everything is bound once at the end.
fetch.station.year <- function(station.id, year) {
  # One station-year download, trimmed to the four columns of interest.
  as.data.frame(riem_measures(
    station = station.id,
    date_start = paste(as.character(year), start.mo.day, sep = "-"),
    date_end = paste(as.character(year), end.mo.day, sep = "-")
  ))[, c("station", "valid", "relh", "tmpf")]
}
asos.chunks <- list()
for (j in seq_along(data.years)) {
  for (i in seq_along(asos.stations$id)) {
    asos.chunks[[length(asos.chunks) + 1]] <-
      fetch.station.year(asos.stations$id[i], data.years[j])
  }
  print(paste("done j loop", j, sep = " "))
}
asos.data <- do.call(rbind, asos.chunks)
Write out file for future use; here “eval = F”; change “eval = T” to run.
# Remove rows whose relative humidity came back as the literal string "null".
asos.data <- dplyr::filter(asos.data, relh != "null")
# Cache the cleaned download so later runs can re-read it from disk.
write.csv(asos.data, "../rawdata/ASOS_data_from_web.csv")
Mallard flight data
Import data from Viana et al. (2013) from Dryad https://datadryad.org/resource/doi:10.5061/dryad.619gd and create new dataframe containing only relevant records (mallards, North America)
NOTE: on the Dryad page linked above, you can find out the correct URL address for the files of interest by clicking on the “data files” link on the right of the webpage, then right-clicking the file name and “copy link address”
I was unable to derive this information directly using rdryad functions, perhaps because this is an older 2013 repository page that was not structured properly? In any case, the code below is reproducible.
# banding data file
# Download the Viana et al. (2013) banding data from Dryad into a temp file.
rdryad::dryad_fetch(url = 'https://datadryad.org/stash/downloads/file_stream/39802',
(tempfile1 <- tempfile(fileext = ".txt")))
## $`https://datadryad.org/stash/downloads/file_stream/39802`
## [1] "/var/folders/xd/lp87q1k11r1d8tbk952jkf4m0000gp/T//Rtmp7TedKg/file730e2d066613.txt"
# Copy into rawdata/. NOTE(review): file.copy() does not overwrite by default,
# so the FALSE below likely means the destination file already existed.
file.copy(tempfile1, "../rawdata/banding_data.txt")
## [1] FALSE
# IF YOU WANT, download readme file for banding data
rdryad::dryad_fetch(url = 'https://datadryad.org/stash/downloads/file_stream/39803',
(tempfile2 <- tempfile(fileext = ".rtf")))
## $`https://datadryad.org/stash/downloads/file_stream/39803`
## [1] "/var/folders/xd/lp87q1k11r1d8tbk952jkf4m0000gp/T//Rtmp7TedKg/file730e7f17807c.rtf"
file.copy(tempfile2, "../rawdata/banding_data_readme.rtf")
## [1] FALSE
# Read the banding data and keep only North American mallard records.
banding.data <- read.delim("../rawdata/banding_data.txt")
mallard.data <- filter(banding.data, Region == "North America" & Species == "Anas platyrhynchos")
Surface waters and reference maps
Import administrative boundaries, lakes, and flyways data using the sf package function st_read:
# Admin-1 (state/province) boundaries — presumably Natural Earth 10 m
# ("ne_10m_..."), lakes variant. Read as an sf object.
borders <- sf::st_read(
"../rawdata/spatial/admin_shapes/ne_10m_admin_1_states_provinces_lakes.shp")
## Reading layer `ne_10m_admin_1_states_provinces_lakes' from data source `/Users/jpither/OneDrive - The University Of British Columbia/work/students/faye/diatom/rawdata/spatial/admin_shapes/ne_10m_admin_1_states_provinces_lakes.shp' using driver `ESRI Shapefile'
## Simple feature collection with 4593 features and 83 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -180 ymin: -90 xmax: 180 ymax: 83.6341
## CRS: 4326
small.lakes <- sf::st_read("../rawdata/spatial/glwd_2.shp") # small lakes
## Reading layer `glwd_2' from data source `/Users/jpither/OneDrive - The University Of British Columbia/work/students/faye/diatom/rawdata/spatial/glwd_2.shp' using driver `ESRI Shapefile'
## Simple feature collection with 244892 features and 7 fields
## geometry type: POLYGON
## dimension: XY
## bbox: xmin: -180 ymin: -55.58721 xmax: 180 ymax: 83.57595
## CRS: 4269
large.lakes <- sf::st_read("../rawdata/spatial/glwd_1.shp") # large lakes; missing CRS, so assign
## Reading layer `glwd_1' from data source `/Users/jpither/OneDrive - The University Of British Columbia/work/students/faye/diatom/rawdata/spatial/glwd_1.shp' using driver `ESRI Shapefile'
## Simple feature collection with 3721 features and 29 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -178.8711 ymin: -54.6059 xmax: 179.995 ymax: 81.98179
## CRS: NA
Map wrangling
Large lakes dataset missing coordinate system definition, so assign using small lakes dataset:
# The large-lakes layer lacks a CRS; inherit the one from the small lakes.
large.lakes <- sf::st_set_crs(large.lakes, sf::st_crs(small.lakes))
Extract the USA, Canada, and Mexico; eliminate extra fields; and put the borders in the same coordinate system as the lakes.
# Keep the three North American countries, drop surplus attribute columns,
# and match the borders' CRS to the lakes layers.
north.america <- dplyr::filter(borders,
                               admin %in% c("United States of America",
                                            "Canada", "Mexico"))
north.america <- north.america[, c(1:60, 84)]
north.america <- sf::st_transform(north.america, st_crs(small.lakes))
Simplify borders to save space using the ms_simplify function from the rmapshaper package:
# Simplify geometries (keep ~10% of vertices) to reduce object size.
north.america <- rmapshaper::ms_simplify(north.america, keep = 0.1)
And exclude Hawaii and Aleutian islands:
# Drop Hawaii, then crop to longitudes -180..0 to cut off Aleutian islands
# crossing the antimeridian.
north.america <- dplyr::filter(north.america, name != "Hawaii")
north.america <- sf::st_crop(north.america, xmin = -180, xmax = 0, ymin = -90, ymax = 90)
## although coordinates are longitude/latitude, st_intersection assumes that they are planar
## Warning: attribute variables are assumed to be spatially constant
## throughout all geometries
Re-project North America to Lambert Conformal Conic for creating an overview map:
# Lambert Conformal Conic projection for the continental overview map.
north.america.lamb <- sf::st_transform(north.america,
crs = "+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80")
Make outer border map layer for overview map:
# Dissolve state/province borders into a single outer outline.
north.america.lamb.outer <- sf::st_union(north.america.lamb)
Now create map of just the focal states:
# Subset the USA layer, then the three focal states.
borders.usa <- dplyr::filter(north.america, admin == "United States of America")
study.area.us <- dplyr::filter(borders.usa,
                               name %in% c("North Dakota", "South Dakota",
                                           "Nebraska"))
Create outer border layer of study region
# Dissolve the three states into one outer study-region boundary.
study.area.outer.border <- sf::st_union(study.area.us)
Crop the lakes to study region:
# Keep only lakes lying entirely within the study region.
# st_covered_by(..., sparse = FALSE) returns a logical matrix with one row
# per lake; a lake is retained when it is covered by at least one geometry
# of the dissolved study-area border. Fixes: spell out FALSE (not F) and use
# rowSums() instead of apply(x, 1, sum) — same result, clearer and faster.
temp <- sf::st_covered_by(small.lakes, study.area.outer.border, sparse = FALSE)
## although coordinates are longitude/latitude, st_covered_by assumes that they are planar
small.lakes.central <- small.lakes[rowSums(temp) > 0, ]
temp <- sf::st_covered_by(large.lakes, study.area.outer.border, sparse = FALSE)
## although coordinates are longitude/latitude, st_covered_by assumes that they are planar
large.lakes.central <- large.lakes[rowSums(temp) > 0, ]
# Stack small and large lakes, keeping only their shared columns so the two
# layers can be row-bound into one.
shared.cols <- intersect(names(small.lakes.central), names(large.lakes.central))
study.area.lakes <- rbind(small.lakes.central[, shared.cols],
                          large.lakes.central[, shared.cols])
rm(temp) # remove temporary matrix
Make coordinate system consistent among datasets, using a projection that is appropriate for distance calculations:
Put all datasets into north america equidistant conic (see https://spatialreference.org/ref/esri/102010/):
# Project everything to North America Equidistant Conic for distance work.
# NOTE(review): 102010 is an ESRI (not EPSG) code; newer sf/PROJ versions may
# require "ESRI:102010" here — confirm this resolves on the current sf build.
study.area.lakes.projected <- sf::st_transform(study.area.lakes, 102010)
study.area.us.projected <- sf::st_transform(study.area.us, 102010)
study.area.outer.border.projected <- sf::st_transform(study.area.outer.border, 102010)
Calculate distance between nearest-neighbour waters
Now get pairwise distances among all lakes in study area and its nearest neighbour lake, using the spatstat package function nndist:
# gets distances between focal lake and its nearest neighbour
# Lake centroids in the projected CRS (coordinates in metres).
temp <- sf::st_coordinates(st_centroid(study.area.lakes.projected))
## Warning in st_centroid.sf(study.area.lakes.projected): st_centroid assumes
## attributes are constant over geometries of x
# Nearest-neighbour distance per lake, converted from metres to kilometres.
# NOTE(review): nndist() now lives in spatstat.geom in current spatstat
# releases — confirm against the installed version.
study.area.lakes.projected$min.nbr.distance.km <- spatstat::nndist(temp) /1000
Plot a histogram of nearest-neighbour distances:
# Histogram of nearest-neighbour distances between water bodies, in 10-km
# bins; the "C" annotation labels this as figure panel C.
neighbour.dist.hist <- ggplot(data = study.area.lakes.projected, aes(x = min.nbr.distance.km)) +
theme_bw() +
# geom_density() +
geom_histogram(col = "black", fill = "darkgrey", alpha = .1, breaks = seq(0, 100, by = 10)) +
scale_x_continuous(breaks = seq(0, 100, by = 10)) +
theme(panel.grid.major.y = element_line(colour = "lightgrey", size = (0.25)),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank()) +
labs(x = "Distance between nearest \nneighbour water bodies (km)", y = "Number of water bodies") +
annotate("text", x = 100, y = 1050, label = "C") +
theme(axis.text.x = element_text(size = 7)) +
theme(axis.text.y = element_text(size = 7)) +
theme(axis.title.x = element_text(size = 9)) +
theme(axis.title.y = element_text(size = 9))
neighbour.dist.hist
Calculating VPD from RH and temperature
Format date field. Make use of the lubridate package.
Note that the raw data provide times in UTC, so these need to be converted to Central Daylight Time (appropriate for the region and the months of April and May).
# Re-read the cached ASOS download from disk.
asos.data <- read.csv("../rawdata/ASOS_data_from_web.csv", header = T, row.names = 1)
# Parse the observation timestamp as UTC.
asos.data$valid <- ymd_hms(asos.data$valid, tz = "UTC")
# Calendar date and hour of day on the UTC clock.
asos.data$date <- base::as.Date(strptime(asos.data$valid,"%Y-%m-%d %H:%M"))
asos.data$hour.of.day <- lubridate::hour(asos.data$valid)
## change to Central Daylight Time (for April - May in the region)
## NOTE: the "with_tz" function knows to adjust Daylight Savings based on the date
asos.data$cdt.date <- lubridate::date(with_tz(asos.data$valid, tz = "US/Central"))
asos.data$cdt.hour.of.day <- lubridate::hour(with_tz(asos.data$valid, tz = "US/Central"))
# get month and day
#asos.data$month.day <- format(as.Date(asos.data$cdt.date), "%m-%d")
Join data to station info:
# Attach station metadata (including state) to every observation.
asos.data <- dplyr::left_join(asos.data, asos.stations, by = c("station" = "id"))
## Warning: Column `station`/`id` joining factor and character vector,
## coercing into character vector
We will use three contrasting times of day for illustration: the early morning hours that tend to correspond to high flight activity and minimum daily VPD, mid-day, and evening.
First, convert Fahrenheit to Celsius:
# Convert air temperature from degrees Fahrenheit ("tmpf") to Celsius.
asos.data[["temp_C"]] <- (5/9) * (asos.data[["tmpf"]] - 32)
Then filter to include morning (4am to 7am), mid-day (noon to 3pm), and dusk (8 to 11pm) times:
# Keep dawn (04-07 h), mid-day (12-15 h) and dusk (20-23 h) observations on
# the local (CDT) clock, then tag each row with its week of the year.
# (Parentheses added for readability; & already binds tighter than |.)
asos.data.dawn.pm <- dplyr::filter(asos.data,
  (cdt.hour.of.day >= 4 & cdt.hour.of.day <= 7) |
  (cdt.hour.of.day >= 12 & cdt.hour.of.day <= 15) |
  (cdt.hour.of.day >= 20 & cdt.hour.of.day <= 23))
asos.data.dawn.pm$week <- lubridate::week(asos.data.dawn.pm$date)
Now let’s get basic descriptive stats for guiding further analyses.
How many stations per state?
# Count distinct reporting stations per state, preserving the order in which
# states first appear in the data.
state.ids <- unique(asos.data.dawn.pm$state)
names(state.ids) <- state.ids
stations.per.state <- lapply(state.ids, function(st) {
  length(table(asos.data.dawn.pm$station[asos.data.dawn.pm$state == st]))
})
stations.per.state
## $ND
## [1] 37
##
## $SD
## [1] 21
##
## $NE
## [1] 40
Now calculate vapour pressure deficit values based on RH and temperature, again using the bigleaf R package
# VPD (kPa) from relative humidity (as a proportion, hence /100) and air
# temperature in Celsius, via bigleaf's rH.to.VPD conversion.
asos.data.dawn.pm$vpd <- bigleaf::rH.to.VPD(asos.data.dawn.pm$relh/100, asos.data.dawn.pm$temp_C)
Create grouping variable for different times of day
# Classify each observation as Dawn / Mid-day / Dusk from its local hour.
# (Only hours 4-7, 12-15 and 20-23 survive the earlier filter, so the
# <10 / >16 cut points partition the data unambiguously.)
hr <- asos.data.dawn.pm$cdt.hour.of.day
tod <- ifelse(hr < 10, "Dawn", ifelse(hr > 16, "Dusk", "Mid-day"))
asos.data.dawn.pm$time_of_day_group <- factor(tod, levels = c("Dawn", "Mid-day", "Dusk"))
Calculate average RH and temperature grouped by weather station, time of day, and week:
# Mean RH, temperature and VPD per station x time-of-day x week; n() is
# stored per variable to keep the original column layout.
# Fix: spell out TRUE rather than the reassignable shorthand T.
asos.data.dawn.pm.mean.station.week <- asos.data.dawn.pm %>%
  dplyr::group_by(station, time_of_day_group, week) %>%
  dplyr::summarise(mean.rh = mean(relh, na.rm = TRUE),
                   sampsize.rh = n(),
                   mean.tempc = mean(temp_C, na.rm = TRUE),
                   sampsize.tempc = n(),
                   mean.vpd = mean(vpd, na.rm = TRUE),
                   sampsize.vpd = n())
# Same summaries aggregated to the state level (used for the time plots).
asos.data.dawn.pm.mean.state.week <- asos.data.dawn.pm %>%
  dplyr::group_by(state, time_of_day_group, week) %>%
  dplyr::summarise(mean.rh = mean(relh, na.rm = TRUE),
                   sampsize.rh = n(),
                   mean.tempc = mean(temp_C, na.rm = TRUE),
                   sampsize.tempc = n(),
                   mean.vpd = mean(vpd, na.rm = TRUE),
                   sampsize.vpd = n())
And, let’s get basic descriptive stats on the RH, temperature, and VPD values from April 1 to May 31, from 2015 to 2020:
# Overall descriptive stats (mean, median, quartiles, n) for RH, temperature
# and VPD, by time of day, pooled across stations, weeks and years.
# Fix: spell out TRUE rather than the reassignable shorthand T.
asos.data.descriptive.stats <- asos.data.dawn.pm %>%
  dplyr::select(time_of_day_group, relh, temp_C, vpd) %>%
  tidyr::pivot_longer(cols = c("relh", "temp_C", "vpd"),
                      names_to = "variable", values_to = "value") %>%
  dplyr::group_by(time_of_day_group, variable) %>%
  dplyr::summarise(mean = mean(value, na.rm = TRUE),
                   median = median(value, na.rm = TRUE),
                   # quantile()[2] and [4] are the 25th and 75th percentiles
                   q1 = quantile(value, na.rm = TRUE)[2],
                   q3 = quantile(value, na.rm = TRUE)[4],
                   sampsize = n())
Now view the output as a formatted table:
# transpose table
options(knitr.kable.NA = '') # suppress showing NA values in table
# Render the descriptive-stats table; LaTeX output is striped and held in
# place ("hold_position").
kable(asos.data.descriptive.stats, booktabs = T, digits = 4,
caption = "Summary statistics for weather variables for April and May, years 2015 to 2020 inclusive",
align = "rrrrr") %>%
kable_styling(latex_options = c("striped", "hold_position"))
Summary statistics for weather variables for April and May, years 2015 to 2020 inclusive
|
time_of_day_group
|
variable
|
mean
|
median
|
q1
|
q3
|
sampsize
|
|
Dawn
|
relh
|
83.6388
|
86.5900
|
74.9300
|
95.1300
|
276392
|
|
Dawn
|
temp_C
|
5.6795
|
6.0000
|
1.1111
|
10.5000
|
276392
|
|
Dawn
|
vpd
|
0.1702
|
0.1106
|
0.0392
|
0.2434
|
276392
|
|
Mid-day
|
relh
|
52.8752
|
48.0000
|
32.8300
|
71.2800
|
272104
|
|
Mid-day
|
temp_C
|
14.3294
|
15.0000
|
9.0000
|
20.2778
|
272104
|
|
Mid-day
|
vpd
|
0.9688
|
0.8404
|
0.3174
|
1.4489
|
272104
|
|
Dusk
|
relh
|
68.3261
|
69.0400
|
52.0500
|
86.1300
|
270493
|
|
Dusk
|
temp_C
|
9.7199
|
10.0000
|
5.0000
|
15.0000
|
270493
|
|
Dusk
|
vpd
|
0.4665
|
0.3489
|
0.1290
|
0.6901
|
270493
|
So, the first and third quartiles for the VPD data are useful reference points for exploration below (along with the timeplot below).
Create time series plot of air temperature.
# Facet labels keyed by the time-of-day group levels.
labels.facet <- c("Dawn (04:00 to 07:00)", "Mid-day (12:00 - 15:00)", "Dusk (20:00 to 23:00)")
names(labels.facet) <- c("Dawn", "Mid-day", "Dusk")
# Fig. S7A: weekly mean air temperature per state, faceted by time of day.
Fig_S7.temp.timeplot <- ggplot(data = asos.data.dawn.pm.mean.state.week,
aes(x = week, y = mean.tempc, col = state)) +
guides(col = guide_legend(title = "State")) +
geom_line() +
theme_bw() +
labs(tag = "A") +
theme(panel.grid.major.y = element_line(colour = "lightgrey", size = (0.25)),
panel.grid.major.x = element_line(colour = "lightgrey", size = (0.25)),
panel.grid.minor.x = element_blank()) +
#ylim(0, 1.5) +
theme(axis.text.x = element_text(size = 7)) +
theme(axis.text.y = element_text(size = 7)) +
theme(axis.title.x = element_text(size = 9)) +
theme(axis.title.y = element_text(size = 9)) +
scale_x_continuous(breaks = 13:22) +
labs(x = "Weeks from January 1", y = "Average air temperature (degrees Celsius)")
Fig_S7.temp.timeplot <- Fig_S7.temp.timeplot + facet_grid(. ~ time_of_day_group, labeller = labeller(time_of_day_group = labels.facet))
Fig_S7.temp.timeplot
Create time series plot of relative humidity.
# Facet labels; the mid-day label now uses "to" for consistency with the
# other two (the original read "12:00 - 15:00").
labels.facet <- c("Dawn (04:00 to 07:00)", "Mid-day (12:00 to 15:00)", "Dusk (20:00 to 23:00)")
names(labels.facet) <- c("Dawn", "Mid-day", "Dusk")
# Time series of weekly mean relative humidity by state (panel B of Fig. S7).
Fig_S7.rh.timeplot <- ggplot(data = asos.data.dawn.pm.mean.state.week,
                             aes(x = week, y = mean.rh, col = state)) +
  guides(col = guide_legend(title = "State")) +
  geom_line() +
  theme_bw() +
  labs(tag = "B") +
  theme(panel.grid.major.y = element_line(colour = "lightgrey", size = (0.25)),
        panel.grid.major.x = element_line(colour = "lightgrey", size = (0.25)),
        panel.grid.minor.x = element_blank(),
        axis.text.x = element_text(size = 7),
        axis.text.y = element_text(size = 7),
        axis.title.x = element_text(size = 9),
        axis.title.y = element_text(size = 9)) +
  scale_x_continuous(breaks = 13:22) +
  labs(x = "Weeks from January 1", y = "Average relative humidity (%)")
# Facet by time of day using the descriptive labels defined above.
Fig_S7.rh.timeplot <- Fig_S7.rh.timeplot + facet_grid(. ~ time_of_day_group, labeller = labeller(time_of_day_group = labels.facet))
Fig_S7.rh.timeplot
Create time series plot of vapour pressure deficit.
# Facet labels; the mid-day label now uses "to" for consistency with the
# other two (the original read "12:00 - 15:00").
labels.facet <- c("Dawn (04:00 to 07:00)", "Mid-day (12:00 to 15:00)", "Dusk (20:00 to 23:00)")
names(labels.facet) <- c("Dawn", "Mid-day", "Dusk")
# Time series of weekly mean VPD by state (panel C of Fig. S7).
# NOTE(review): "vpa" in the object name is presumably a typo for "vpd";
# kept unchanged so any later references still resolve.
Fig_S7.vpa.timeplot <- ggplot(data = asos.data.dawn.pm.mean.state.week,
                              aes(x = week, y = mean.vpd, col = state)) +
  guides(col = guide_legend(title = "State")) +
  geom_line() +
  theme_bw() +
  labs(tag = "C") +
  theme(panel.grid.major.y = element_line(colour = "lightgrey", size = (0.25)),
        panel.grid.major.x = element_line(colour = "lightgrey", size = (0.25)),
        panel.grid.minor.x = element_blank(),
        axis.text.x = element_text(size = 7),
        axis.text.y = element_text(size = 7),
        axis.title.x = element_text(size = 9),
        axis.title.y = element_text(size = 9)) +
  # fixed y range so the three panels (and the RH/temp figures) are comparable
  ylim(0, 1.5) +
  scale_x_continuous(breaks = 13:22) +
  labs(x = "Weeks from January 1", y = "Average vapour pressure deficit (kPa)")
# Facet by time of day using the descriptive labels defined above.
Fig_S7.vpa.timeplot <- Fig_S7.vpa.timeplot + facet_grid(. ~ time_of_day_group, labeller = labeller(time_of_day_group = labels.facet))
Fig_S7.vpa.timeplot
Clearly there is a huge difference between mid-day and the other two times of day.
Describe mallard flight data
Calculate median distance travelled by mallards in North America, both overall and for trips that lasted less than a day
# Convert reported distances from km to m, then take the medians.
mallard.data$dist_m <- mallard.data$Distance..Km. * 1000
viana_median_dist_m <- median(mallard.data$dist_m)
# Median restricted to trips completed within a single day
same.day <- mallard.data$Time.elapsed..days. == 0
viana_median_dist_m.within.day <- median(mallard.data$dist_m[same.day])
Get just distances travelled within a day:
mallard.data.within.day <- dplyr::filter(mallard.data, Time.elapsed..days. == 0)
Plot histogram of daily distance:
# Histogram of within-day mallard flight distances (panel B).
flight.dist.hist <- ggplot(data = mallard.data.within.day, aes(x = Distance..Km.)) +
  theme_bw() +
  geom_histogram(col = "black", fill="darkgrey",
                 alpha = .1, breaks = seq(0, 600, by = 100)) +
  scale_x_continuous(breaks = seq(0, 600, by = 100)) +
  # gridline and text sizing in one theme() call
  theme(panel.grid.major.y = element_line(colour="lightgrey", size = (0.25)),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        axis.text.x = element_text(size = 7),
        axis.text.y = element_text(size = 7),
        axis.title.x = element_text(size = 9),
        axis.title.y = element_text(size = 9)) +
  labs(x="Distance travelled (km)", y = "Number of flights") +
  annotate("text", x = 600, y = 24, label = "B") # panel letter
flight.dist.hist
Diatom data
Import diatom data, using 2007 and 2012 NLA data. The latter year only provides occurrences of the genus Nitzschia, whereas the former year provides both the genus occurrences and those of species N. pusilla.
# All four NLA files are read with identical CSV options, so wrap the call
# once (this also drops a stray trailing semicolon the original carried).
read.nla.csv <- function(path) {
  read.csv(path, header = TRUE, strip.white = TRUE,
           stringsAsFactors = FALSE, encoding = "latin1")
}
nla.2007.phyto.counts <- read.nla.csv("../rawdata/diatom_lake_data/nla2007_phytoplankton_diatomcount_20091125.txt")
nla.2007.lake.data <- read.nla.csv("../rawdata/diatom_lake_data/nla2007_sampledlakeinformation_20091113.txt")
nla.2012.lake.data <- read.nla.csv("../rawdata/diatom_lake_data/nla2012_wide_siteinfo_08232016.txt")
nla.2012.phyto.counts <- read.nla.csv("../rawdata/diatom_lake_data/nla2012_wide_phytoplankton_count_02122014.txt")
Nitzschia occurrence data
Subset to first visit samples (some lakes had multiple visits), and then make pres/abs table
# Presence/absence of each taxon by lake, first visits only (some lakes were
# visited more than once).
first.visit.2007 <- dplyr::filter(nla.2007.phyto.counts, VISIT_NO == 1)
nla.2007.phyto.presabs <- table(first.visit.2007$SITE_ID, first.visit.2007$TAXANAME)
Identify lakes where the genus Nitzschia occurs, then where Nitzschia pusilla Grunow occurs
# Column indices in the 2007 pres/abs table: any Nitzschia taxon (genus
# level) and the single species of interest.
taxon.names.2007 <- dimnames(nla.2007.phyto.presabs)[[2]]
Nitzschia.2007.pres <- grep("Nitzschia", taxon.names.2007)
Nitzschia.pusilla.2007.pres <- grep("Nitzschia pusilla Grunow", taxon.names.2007)
Get lake names
# Lakes with at least one Nitzschia (genus) occurrence. drop = FALSE keeps the
# column subset a matrix even if only one column matched, so rowSums() is safe;
# indexing rownames directly replaces the original subset-then-row.names dance.
Nitzschia.2007.lakes <- rownames(nla.2007.phyto.presabs)[
  rowSums(nla.2007.phyto.presabs[, Nitzschia.2007.pres, drop = FALSE]) > 0]
Nitzschia.2007.lake.info <- nla.2007.lake.data[match(Nitzschia.2007.lakes, nla.2007.lake.data$SITE_ID),]
# Lakes where N. pusilla specifically occurs (same logic, species column only).
Nitzschia.pusilla.2007.lakes <- rownames(nla.2007.phyto.presabs)[
  rowSums(nla.2007.phyto.presabs[, Nitzschia.pusilla.2007.pres, drop = FALSE]) > 0]
Nitzschia.pusilla.2007.lake.info <- nla.2007.lake.data[match(Nitzschia.pusilla.2007.lakes, nla.2007.lake.data$SITE_ID),]
Now the 2012 data. Subset to first-visit samples, and then make a presence/absence table.
# First-visit subset for 2012, then taxon-by-lake presence/absence table
# (the 2012 file names the taxon column TARGET_TAXON rather than TAXANAME).
first.visit.2012 <- dplyr::filter(nla.2012.phyto.counts, VISIT_NO == 1)
nla.2012.phyto.presabs <- table(first.visit.2012$SITE_ID, first.visit.2012$TARGET_TAXON)
Now for 2012 data, identify lakes where the genus Nitzschia occurs, then where Nitzschia pusilla Grunow occurs
# genus-level matches; 2012 taxon names are upper case, hence "NITZSCHIA"
Nitzschia.2012.pres <- grep("NITZSCHIA", dimnames(nla.2012.phyto.presabs)[[2]])
Get lake names
# Lakes with any Nitzschia occurrence in 2012. drop = FALSE keeps the column
# subset a matrix even when grep() matched a single taxon, so rowSums() works.
Nitzschia.2012.lakes <- rownames(nla.2012.phyto.presabs)[
  rowSums(nla.2012.phyto.presabs[, Nitzschia.2012.pres, drop = FALSE]) > 0]
Nitzschia.2012.lake.info <- nla.2012.lake.data[match(Nitzschia.2012.lakes, nla.2012.lake.data$SITE_ID),]
Now export to file for backup.
# Back up occurrence coordinates to CSV. Row numbers are written as an
# unnamed first column; add row.names = FALSE if that is unwanted downstream.
write.csv(Nitzschia.2007.lake.info[,c("SITE_ID", "DATE_COL", "LON_DD", "LAT_DD")],
"../output/nitzschia_2007_occurrences_NLA.csv")
# NOTE(review): "pusella" in the path below is a misspelling of "pusilla";
# left unchanged because downstream steps may reference this exact file name.
write.csv(Nitzschia.pusilla.2007.lake.info[,c("SITE_ID", "DATE_COL", "LON_DD", "LAT_DD")], "../output/nitzschia_pusella_2007_occurrences_NLA.csv")
# 2012 site info uses NAD83 columns (LAT_DD83/LON_DD83) rather than LAT_DD/LON_DD
write.csv(Nitzschia.2012.lake.info[,c("SITE_ID", "DATE_COL", "LAT_DD83", "LON_DD83")],
"../output/nitzschia_2012_occurrences_NLA.csv")
Now, need to project lat/long for mapping, and the data frame to SF format, see this tutorial here https://datacarpentry.org/r-raster-vector-geospatial/10-vector-csv-to-shapefile-in-r/index.html.
We will assume the Lat/Long information in the CSV file is NAD83.
# All three point layers share the same geographic CRS (NAD83 / GRS80
# lat-long), so define it once.
nad83.longlat <- "+proj=longlat +ellps=GRS80 +datum=NAD83 +no_defs"
Nitzschia.locations.2007.sf <- sf::st_as_sf(
  Nitzschia.2007.lake.info[, c("SITE_ID", "DATE_COL", "LON_DD", "LAT_DD")],
  coords = c("LON_DD", "LAT_DD"), crs = nad83.longlat)
diatom.locations.2012.sf <- sf::st_as_sf(
  Nitzschia.2012.lake.info[, c("SITE_ID", "DATE_COL", "LAT_DD83", "LON_DD83")],
  coords = c("LON_DD83", "LAT_DD83"), crs = nad83.longlat)
pusilla.locations.2007.sf <- sf::st_as_sf(
  Nitzschia.pusilla.2007.lake.info[, c("SITE_ID", "DATE_COL", "LON_DD", "LAT_DD")],
  coords = c("LON_DD", "LAT_DD"), crs = nad83.longlat)
Diatom data projection, then intersect with study area
# Reproject every occurrence layer to the CRS of the projected study area.
target.crs <- sf::st_crs(study.area.us.projected)
Nitzschia.locations.2007.projected <- sf::st_transform(Nitzschia.locations.2007.sf, target.crs)
Nitzschia.locations.2012.projected <- sf::st_transform(diatom.locations.2012.sf, target.crs)
pusilla.locations.2007.projected <- sf::st_transform(pusilla.locations.2007.sf, target.crs)
Clip to study region
# Keep only points that intersect the study area. as.matrix() yields a dense
# logical matrix whether st_intersects() returned a sparse (sgbp) or dense
# result, and rowSums(...) > 0 flags "intersects anything" -- replacing the
# slower apply(..., 1, function(x){sum(x)}) idiom.
Nitzschia.temp <- sf::st_intersects(Nitzschia.locations.2007.projected, study.area.us.projected)
Nitzschia.locations.2007.cropped <- Nitzschia.locations.2007.projected[
  rowSums(as.matrix(Nitzschia.temp)) > 0, ]
Nitzschia.temp <- sf::st_intersects(Nitzschia.locations.2012.projected, study.area.us.projected)
Nitzschia.locations.2012.cropped <- Nitzschia.locations.2012.projected[
  rowSums(as.matrix(Nitzschia.temp)) > 0, ]
pusilla.temp <- sf::st_intersects(pusilla.locations.2007.projected, study.area.us.projected)
pusilla.locations.2007.cropped <- pusilla.locations.2007.projected[
  rowSums(as.matrix(pusilla.temp)) > 0, ]
Combine the Nitzschia genus locations from 2007 and 2012 NLA data:
Nitzschia.locations <- rbind(Nitzschia.locations.2007.cropped, # rbind keeps the sf class; both layers have matching columns and CRS
Nitzschia.locations.2012.cropped)
Spatial predictions
For spatial predictions, we need the following raster layers:
- vapour pressure deficit layer
- a flight time raster layer, with pixels representing flight time from source lakes (which will be the N. pusilla lakes)
We create the flight time layer using a distance layer.
Create VPD layer
For spatial interpolation of VPD across the region, we use beginning-of-May VPD data, which generally corresponds to week 18. Then get the mean VPD by station for each of the dawn, mid-day, and dusk periods.
# Mean week-18 (early May) VPD per station and time-of-day window.
# na.rm spelled out as TRUE (T is reassignable); one pipe step per line.
asos.data.week18.vpd <- asos.data.dawn.pm %>%
  dplyr::filter(week == 18) %>%
  dplyr::group_by(station, time_of_day_group) %>%
  dplyr::summarise(mean.vpd = mean(vpd, na.rm = TRUE))
# Attach station coordinates/metadata by station id.
asos.data.week18.vpd <- dplyr::left_join(asos.data.week18.vpd, asos.stations, by = c("station" = "id"))
Create geospatial SF object from VPD data, using appropriate CRS.
First do dawn hours:
# Build an sf point layer in the same CRS as the small-lakes layer, then
# project it to CRS code 102010 (the code the rest of the analysis uses).
asos.data.week18.vpd.sf <- sf::st_as_sf(asos.data.week18.vpd,
                                        coords = c("lon", "lat"),
                                        crs = sf::st_crs(small.lakes))
asos.projected.week18.vpd <- sf::st_transform(asos.data.week18.vpd.sf, 102010)
Now create a raster grid on which to predict VPD using an inverse distance weighted (IDW) interpolation model based on the nearest neighbouring stations.
We’ll use a simple inverse distance weighted interpolation model: 3 nearest neighbors and a 2nd order polynomial. See the following online resources: https://rspatial.org/raster/analysis/4-interpolation.html#nearest-neighbour-interpolation.
This makes use of the sf, sp, gstat, and raster packages.
First create sample grid:
# Lay a ~50,000-point regular sample grid over the station layer, then promote
# it step by step to a SpatialGrid. The order of the sp calls matters:
# coordinates() -> gridded() -> fullgrid().
grd.vpd <- as.data.frame(sp::spsample(sf::as_Spatial(asos.projected.week18.vpd), "regular", n = 50000))
names(grd.vpd) <- c("X", "Y")
sp::coordinates(grd.vpd) <- c("X", "Y")
sp::gridded(grd.vpd) <- TRUE # Create SpatialPixel object
sp::fullgrid(grd.vpd) <- TRUE # Create SpatialGrid object
# Carry the stations' CRS over to the grid, then convert to a raster template.
# (unqualified as_Spatial/proj4string rely on sf/sp being attached)
proj4string(grd.vpd) <- sp::proj4string(as_Spatial(asos.projected.week18.vpd))
grd.vpd.raster <- raster::raster(grd.vpd)
Build spatial models:
# IDW interpolation model (3 nearest neighbours, inverse-distance power 2)
# for one time-of-day window. A helper removes the triplicated code and
# avoids filtering the station layer twice per model as the original did.
make.idw.model <- function(window) {
  stations <- dplyr::filter(asos.projected.week18.vpd, time_of_day_group == window)
  gstat::gstat(formula = mean.vpd ~ 1, data = stations,
               locations = as_Spatial(stations),
               nmax = 3, set = list(idp = 2))
}
gs.dawn.vpd <- make.idw.model("Dawn")
gs.mid.vpd <- make.idw.model("Mid-day")
gs.dusk.vpd <- make.idw.model("Dusk")
Interpolate dawn, mid-day, and dusk values, and create raster stack, with band1 = Dawn VPD, and band2 = mid-day and band3 = dusk VPD
# Interpolate each model onto the grid and stack the layers:
# band 1 = dawn, band 2 = mid-day, band 3 = dusk.
vpd.layers <- lapply(list(gs.dawn.vpd, gs.mid.vpd, gs.dusk.vpd),
                     function(model) raster::interpolate(grd.vpd.raster, model))
gs.vpd.stack <- raster::stack(vpd.layers)
## [inverse distance weighted interpolation]
## [inverse distance weighted interpolation]
## [inverse distance weighted interpolation]
proj4string(gs.vpd.stack) <- sp::proj4string(as_Spatial(asos.projected.week18.vpd))
Clip to study region
gs.vpd.stack.clip <- raster::mask(gs.vpd.stack, sf::as_Spatial(study.area.outer.border.projected))
Create raster dataframe using ggspatial package (required to map with ggplot)
# One data frame per band (var1.pred.1 = dawn, .2 = mid-day, .3 = dusk).
dawn.vpd.raster.dataframe <- ggspatial::df_spatial(gs.vpd.stack.clip$var1.pred.1)
mid.vpd.raster.dataframe <- ggspatial::df_spatial(gs.vpd.stack.clip$var1.pred.2)
dusk.vpd.raster.dataframe <- ggspatial::df_spatial(gs.vpd.stack.clip$var1.pred.3)
Gather descriptive stats
# Descriptive statistics for each interpolated VPD surface (values in kPa).
summary(na.omit(dawn.vpd.raster.dataframe$band1))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.07439 0.17777 0.20781 0.20675 0.23511 0.40769
summary(na.omit(mid.vpd.raster.dataframe$band1))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.6976 1.1471 1.2500 1.2348 1.3320 1.5227
summary(na.omit(dusk.vpd.raster.dataframe$band1))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3080 0.5155 0.5642 0.5520 0.5998 0.7776
Visualize the data:
# Dawn VPD surface: interpolated raster fill + study-area outline + stations.
VPD.dawn.map <- ggplot() +
  geom_raster(data = dawn.vpd.raster.dataframe, aes(x = x, y = y, fill = band1)) +
  # shared colour scale across the three panels; NA cells stay transparent
  scale_fill_gradient("VPD (kPa)",
                      low = 'yellow', high = 'blue', limits = c(0.05, 1.75),
                      na.value = NA, guide = guide_colorbar(ticks.colour = "black",
                                                            direction = "horizontal", title.position = "top",
                                                            ticks.linewidth = 1.5,
                                                            title.hjust = .5, frame.colour = "black")) +
  labs(title = "Dawn (04:00 - 07:00)", x = "Longitude", y = "Latitude") +
  geom_sf(data = study.area.us.projected, col = "grey", lwd = 1.2, fill = NA) +
  geom_sf(data = dplyr::filter(asos.projected.week18.vpd, time_of_day_group == "Dawn"), col = "black", lwd = 1) +
  # text/legend sizing collected into a single theme() call
  theme(plot.title = element_text(size = 7.5, face = "bold", hjust = 0.5, vjust = -6),
        legend.position = c(0.5, 1.1),
        legend.title = element_text(size = 7.5, face = "bold"),
        legend.text = element_text(size = 7),
        legend.key.height = unit(0.4, "cm"), legend.key.width = unit(0.95, "cm"),
        axis.text.x = element_text(size = 7),
        axis.text.y = element_text(size = 7),
        axis.title.x = element_text(size = 9),
        axis.title.y = element_text(size = 9),
        plot.margin = margin(1, 1, 1, 1, "pt"))
VPD.dawn.map
Now mid-day:
# Mid-day VPD surface; y-axis text/title suppressed because this panel sits
# to the right of the dawn panel in the assembled figure.
VPD.mid.map <- ggplot() +
  geom_raster(data = mid.vpd.raster.dataframe, aes(x = x, y = y, fill = band1)) +
  # shared colour scale across the three panels; NA cells stay transparent
  scale_fill_gradient("VPD (kPa)",
                      low = 'yellow', high = 'blue', limits = c(0.05, 1.75),
                      na.value = NA, guide = guide_colorbar(ticks.colour = "black",
                                                            direction = "horizontal", title.position = "top",
                                                            ticks.linewidth = 1.5,
                                                            title.hjust = .5, frame.colour = "black")) +
  labs(title = "Mid-day (12:00 - 15:00)", x = "Longitude", y = "Latitude") +
  geom_sf(data = study.area.us.projected, col = "grey", lwd = 1.2, fill = NA) +
  geom_sf(data = dplyr::filter(asos.projected.week18.vpd, time_of_day_group == "Mid-day"), col = "black", lwd = 1) +
  theme(plot.title = element_text(size = 7.5, face = "bold", hjust = 0.5, vjust = -6),
        legend.position = c(0.5, 1.1),
        legend.title = element_text(size = 7.5, face = "bold"),
        legend.text = element_text(size = 7),
        legend.key.height = unit(0.4, "cm"), legend.key.width = unit(0.95, "cm"),
        axis.text.x = element_text(size = 7),
        axis.text.y = element_blank(),
        axis.title.x = element_text(size = 9),
        axis.title.y = element_blank(),
        plot.margin = margin(1, 1, 1, 1, "pt"))
VPD.mid.map
And dusk:
# Dusk VPD surface; y-axis text/title suppressed as for the mid-day panel.
VPD.dusk.map <- ggplot() +
  geom_raster(data = dusk.vpd.raster.dataframe, aes(x = x, y = y, fill = band1)) +
  # shared colour scale across the three panels; NA cells stay transparent
  scale_fill_gradient("VPD (kPa)",
                      low = 'yellow', high = 'blue', limits = c(0.05, 1.75),
                      na.value = NA, guide = guide_colorbar(ticks.colour = "black",
                                                            direction = "horizontal", title.position = "top",
                                                            ticks.linewidth = 1.5,
                                                            title.hjust = .5, frame.colour = "black")) +
  labs(title = "Dusk (20:00 - 23:00)", x = "Longitude", y = "Latitude") +
  geom_sf(data = study.area.us.projected, col = "grey", lwd = 1.2, fill = NA) +
  geom_sf(data = dplyr::filter(asos.projected.week18.vpd, time_of_day_group == "Dusk"), col = "black", lwd = 1) +
  theme(plot.title = element_text(size = 7.5, face = "bold", hjust = 0.5, vjust = -6),
        legend.position = c(0.5, 1.1),
        legend.title = element_text(size = 7.5, face = "bold"),
        legend.text = element_text(size = 7),
        legend.key.height = unit(0.4, "cm"), legend.key.width = unit(0.95, "cm"),
        axis.text.x = element_text(size = 7),
        axis.text.y = element_blank(),
        axis.title.x = element_text(size = 9),
        axis.title.y = element_blank(),
        plot.margin = margin(1, 1, 1, 1, "pt"))
VPD.dusk.map
Create flight time layer
We’ll use the distanceFromPoints raster function to calculate distance from all points of interest
First, exclude any lakes that are outside of the bounding box of the raster:
# Exclude lakes west of the clipped VPD raster's bounding box.
# NOTE(review): only the western (xmin) edge is checked here; points beyond
# the other three edges, if any exist, would not be excluded -- TODO confirm
# that is intended for this study region.
pusilla.locations.2007.cropped.to.raster <- pusilla.locations.2007.cropped[
st_coordinates(pusilla.locations.2007.cropped)[,1] > sp::bbox(gs.vpd.stack.clip)[1,1],]
Nitzschia.locations.cropped.to.raster <- Nitzschia.locations[
st_coordinates(Nitzschia.locations)[,1] > sp::bbox(gs.vpd.stack.clip)[1,1],]
Use the N. pusilla occurrence lakes as source lakes.
# Distance (m) from every raster cell to the nearest N. pusilla source lake,
# masked to the study region.
source.lakes.sp <- as_Spatial(pusilla.locations.2007.cropped.to.raster)
pusilla.raster <- raster::distanceFromPoints(gs.vpd.stack.clip, source.lakes.sp)
pusilla.raster <- raster::mask(pusilla.raster, as_Spatial(study.area.outer.border.projected))
Convert to km
pusilla.raster.km <- pusilla.raster/1000
So now we can use this raster to calculate a new raster representing the time (in minutes) from each source lake, based on flight speed of 69 km/h.
pusilla.raster.flight.time <- (pusilla.raster.km / 69) * 60
Create potential dispersal raster
Now with the flight time and the VPD raster layers, we can use the predict raster function to predict probability of potential dispersal.
First create a raster stack with the two variables (layers)
# Pair each time-of-day VPD band with the flight-time layer. Layer names must
# match the GLM predictor names ("vpd", "time_min_num") for raster::predict().
stack.with.flight <- function(vpd.layer) {
  s <- raster::stack(vpd.layer, pusilla.raster.flight.time)
  names(s) <- c("vpd", "time_min_num")
  s
}
vpd.dawn.flight.stack <- stack.with.flight(gs.vpd.stack.clip$var1.pred.1)
vpd.mid.flight.stack <- stack.with.flight(gs.vpd.stack.clip$var1.pred.2)
vpd.dusk.flight.stack <- stack.with.flight(gs.vpd.stack.clip$var1.pred.3)
Now make a probability of potential dispersal layer based on the GLM:
# Predicted probability of potential dispersal from the VPD-based GLM
# (glm.full.vpd, fitted earlier in the script), one raster per time of day.
dispersal.dawn.prob.raster <- raster::predict(vpd.dawn.flight.stack, glm.full.vpd, type = "response")
dispersal.mid.prob.raster <- raster::predict(vpd.mid.flight.stack, glm.full.vpd, type = "response")
dispersal.dusk.prob.raster <- raster::predict(vpd.dusk.flight.stack, glm.full.vpd, type = "response")
Convert the raster to a dataframe using ggspatial function df_spatial
# Flatten each probability raster to a data frame for ggplot mapping.
dispersal.dawn.prob.dataframe <- ggspatial::df_spatial(dispersal.dawn.prob.raster)
dispersal.mid.prob.dataframe <- ggspatial::df_spatial(dispersal.mid.prob.raster)
dispersal.dusk.prob.dataframe <- ggspatial::df_spatial(dispersal.dusk.prob.raster)
Create final map figures
Create overview map
# Small overview map: North America outlines with the study area marked.
overview.map <- ggplot() +
  theme_bw() +
  geom_sf(data = north.america.lamb, col = "grey", fill = NA, alpha = 0.4) +
  geom_sf(data = north.america.lamb.outer, fill = NA, col = "darkgrey") +
  geom_sf(data = study.area.outer.border, col = "black", fill = NA) +
  coord_sf(datum = NA) + # suppress graticule and axis labels
  theme(plot.margin = margin(1, 1, 1, 1, "pt"))
overview.map

Create dispersal probability map, showing probability of successful dispersal from source N. pusilla lakes.
NOTE The color scales are situated outside the map area, and don’t show up. This is on purpose. When we assemble the three maps into a single figure (at bottom of script), we’ll reveal the colour scale!
First for dawn:
# Dawn dispersal-probability map. The colour bar sits above the plot area on
# purpose: it is revealed when the three panels are assembled at the bottom
# of the script.
disp.prob.map.dawn <- ggplot() +
  theme_bw() +
  geom_tile(data = dispersal.dawn.prob.dataframe, aes(x = x, y = y, fill = band1)) +
  scale_fill_gradient(low = 'yellow', high = 'darkgreen',limits = c(0, 1), breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1.0),
                      na.value = NA, guide = guide_colorbar(
                        title = "Probability of potential dispersal",
                        direction = "horizontal", title.position = "top",
                        ticks.colour = "black", ticks.linewidth = 1.5,
                        title.hjust = .5, frame.colour = "black")) +
  labs(title = "Dawn (04:00 - 07:00)") +
  theme(plot.title = element_text(size = 7.5, face = "bold", hjust = 0.5, vjust = -8)) +
  theme(legend.position = c(0.5, 1.1),
        legend.title = element_text(size = 7.5, face = "bold"),
        legend.text = element_text(size = 7),
        legend.key.height = unit(0.4, "cm"), legend.key.width = unit(0.95, "cm")) +
  geom_sf(data = study.area.us.projected, fill = NA) +
  geom_sf(data = study.area.lakes.projected, col = "blue") +
  # Nitzschia (genus) lakes. The original also passed lwd = 1.5, which
  # standardises to the same aesthetic as size and only triggered a
  # "Duplicated aesthetics" warning (size, listed first, took effect).
  geom_sf(data = Nitzschia.locations, shape = 5, fill = "blue", size = 2.2) +
  geom_sf(data = pusilla.locations.2007.cropped, shape = 23, size = 2.3, fill = "red") + # N. pusilla source lakes
  coord_sf(xlim = c(-650000, 80000), ylim = c(-10000, 1000000)) +
  labs(x = "Longitude", y = "Latitude") +
  theme(axis.text.x = element_text(size = 7)) +
  theme(axis.text.y = element_text(size = 7)) +
  theme(axis.title.x = element_text(size = 9)) +
  theme(axis.title.y = element_text(size = 9)) +
  theme(plot.margin = margin(1, 1, 1, 1, "pt"))
disp.prob.map.dawn <- disp.prob.map.dawn +
  ggsn::scalebar(study.area.us.projected, st.size = 1.5,
                 dist = 50, transform = FALSE, border.size = 0.2,
                 dist_unit = "km", model = "GRS80", location = "bottomleft",
                 box.fill = c("darkgrey", "white"))
disp.prob.map.dawn
Now for mid-day:
# Mid-day dispersal-probability map; y-axis text/title suppressed because
# this panel sits to the right of the dawn panel in the assembled figure.
disp.prob.map.mid <- ggplot() +
  theme_bw() +
  geom_tile(data = dispersal.mid.prob.dataframe, aes(x = x, y = y, fill = band1)) +
  scale_fill_gradient(low = 'yellow', high = 'darkgreen',limits = c(0, 1), breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1.0),
                      na.value = NA, guide = guide_colorbar(
                        title = "Probability of potential dispersal",
                        direction = "horizontal", title.position = "top",
                        ticks.colour = "black", ticks.linewidth = 1.5,
                        title.hjust = .5, frame.colour = "black")) +
  labs(title = "Mid-day (12:00 - 15:00)") +
  theme(plot.title = element_text(size = 7.5, face = "bold", hjust = 0.5, vjust = -8)) +
  theme(legend.position = c(0.5, 1.1),
        legend.title = element_text(size = 7.5, face = "bold"),
        legend.text = element_text(size = 7),
        legend.key.height = unit(0.4, "cm"), legend.key.width = unit(0.95, "cm")) +
  geom_sf(data = study.area.us.projected, fill = NA) +
  geom_sf(data = study.area.lakes.projected, col = "blue") +
  # Nitzschia (genus) lakes; the redundant lwd = 1.5 was dropped (it
  # duplicated the size aesthetic and only produced a warning).
  geom_sf(data = Nitzschia.locations, shape = 5, fill = "blue", size = 2.2) +
  geom_sf(data = pusilla.locations.2007.cropped, shape = 23, size = 2.3, fill = "red") + # N. pusilla source lakes
  coord_sf(xlim = c(-650000, 80000), ylim = c(-10000, 1000000)) +
  labs(x = "Longitude", y = "Latitude") +
  theme(axis.text.x = element_text(size = 7)) +
  theme(axis.text.y = element_blank()) +
  theme(axis.title.x = element_text(size = 9)) +
  theme(axis.title.y = element_blank()) +
  theme(plot.margin = margin(1, 1, 1, 1, "pt"))
disp.prob.map.mid <- disp.prob.map.mid +
  ggsn::scalebar(study.area.us.projected, st.size = 1.5,
                 dist = 50, transform = FALSE, border.size = 0.2,
                 dist_unit = "km", model = "GRS80", location = "bottomleft",
                 box.fill = c("darkgrey", "white"))
disp.prob.map.mid
And dusk:
# Dusk dispersal-probability map; y-axis text/title suppressed as for the
# mid-day panel.
disp.prob.map.dusk <- ggplot() +
  theme_bw() +
  geom_tile(data = dispersal.dusk.prob.dataframe, aes(x = x, y = y, fill = band1)) +
  scale_fill_gradient(low = 'yellow', high = 'darkgreen',limits = c(0, 1), breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1.0),
                      na.value = NA, guide = guide_colorbar(
                        title = "Probability of potential dispersal",
                        direction = "horizontal", title.position = "top",
                        ticks.colour = "black", ticks.linewidth = 1.5,
                        title.hjust = .5, frame.colour = "black")) +
  labs(title = "Dusk (20:00 - 23:00)") +
  theme(plot.title = element_text(size = 7.5, face = "bold", hjust = 0.5, vjust = -8)) +
  theme(legend.position = c(0.5, 1.1),
        legend.title = element_text(size = 7.5, face = "bold"),
        legend.text = element_text(size = 7),
        legend.key.height = unit(0.4, "cm"), legend.key.width = unit(0.95, "cm")) +
  geom_sf(data = study.area.us.projected, fill = NA) +
  geom_sf(data = study.area.lakes.projected, col = "blue") +
  # Nitzschia (genus) lakes; the redundant lwd = 1.5 was dropped (it
  # duplicated the size aesthetic and only produced a warning).
  geom_sf(data = Nitzschia.locations, shape = 5, fill = "blue", size = 2.2) +
  geom_sf(data = pusilla.locations.2007.cropped, shape = 23, size = 2.3, fill = "red") + # N. pusilla source lakes
  coord_sf(xlim = c(-650000, 80000), ylim = c(-10000, 1000000)) +
  labs(x = "Longitude", y = "Latitude") +
  theme(axis.text.x = element_text(size = 7)) +
  theme(axis.text.y = element_blank()) +
  theme(axis.title.x = element_text(size = 9)) +
  theme(axis.title.y = element_blank()) +
  theme(plot.margin = margin(1, 1, 1, 1, "pt"))
disp.prob.map.dusk <- disp.prob.map.dusk +
  ggsn::scalebar(study.area.us.projected, st.size = 1.5,
                 dist = 50, transform = FALSE, border.size = 0.2,
                 dist_unit = "km", model = "GRS80", location = "bottomleft",
                 box.fill = c("darkgrey", "white"))
disp.prob.map.dusk
Now extract the dispersal probability values for each Nitzschia occurrence lake, and visualize with a histogram.
For this we’ll exclude the N. pusilla lakes, as these are considered the sources.
all.Nitzschia.lakes.minus.pusilla <- Nitzschia.locations[match(Nitzschia.locations$SITE_ID, setdiff(Nitzschia.locations$SITE_ID, pusilla.locations.2007.cropped$SITE_ID))[!is.na(match(Nitzschia.locations$SITE_ID, setdiff(Nitzschia.locations$SITE_ID, pusilla.locations.2007.cropped$SITE_ID)))],]
Get probabilities:
# Dispersal probability at each non-source Nitzschia lake for all three time
# windows, in long format. The coordinates and the Spatial conversion are
# computed once (the original repeated them inside each extract() call), and
# explicit rep()/unname() avoids the "row names were found from a short
# variable and have been discarded" warning the original emitted.
lake.coords <- st_coordinates(all.Nitzschia.lakes.minus.pusilla)
lakes.sp <- as_Spatial(all.Nitzschia.lakes.minus.pusilla)
dp.by.window <- list(
  "Dawn" = raster::extract(dispersal.dawn.prob.raster, lakes.sp),
  "Mid-day" = raster::extract(dispersal.mid.prob.raster, lakes.sp),
  "Dusk" = raster::extract(dispersal.dusk.prob.raster, lakes.sp))
n.lakes <- length(dp.by.window[["Dawn"]])
Nitzschia.lake.disp.probs <- data.frame(
  x = rep(unname(lake.coords[, 1]), times = 3),
  y = rep(unname(lake.coords[, 2]), times = 3),
  time = factor(rep(c("Dawn", "Mid-day", "Dusk"), each = n.lakes),
                levels = c("Dawn", "Mid-day", "Dusk")),
  dp = unlist(dp.by.window, use.names = FALSE))
Now a boxplot comparing dispersal probabilities across times of day:
# Violin + jittered points comparing dispersal probabilities by time of day.
Fig5.dp.boxplot <- ggplot(Nitzschia.lake.disp.probs, aes(y = dp, x = time)) +
  theme_bw() +
  geom_violin(col = "black") +
  geom_jitter(col = "grey", alpha = 0.5, width = 0.2) +
  ylim(0, 1) +
  labs(y = "Probability of potential dispersal", x = "Time of day") +
  # gridline, tag-position and text sizing collected in one theme() call
  theme(panel.grid.minor.x = element_blank(),
        panel.grid.minor.y = element_blank(),
        panel.grid.major.y = element_line(colour = "lightgrey", size = (0.25)),
        panel.grid.major.x = element_blank(),
        plot.tag.position = c(0.1, 0.95),
        axis.text.x = element_text(size = 7),
        axis.text.y = element_text(size = 7),
        axis.title.x = element_text(size = 9),
        axis.title.y = element_text(size = 9))
Fig5.dp.boxplot